#importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Load the Excel workbook from the working directory.
data=pd.ExcelFile('rev data for test.xlsx')
#one sheet present in Excel
data.sheet_names
#converting dataset into DataFrame
df = data.parse('data for test')
#Familiarising with the data
df.head()
df.describe()
df.info()
# Pairwise relationships between numeric columns, coloured by call type.
sns.pairplot(data=df, hue='Type')

# Horizontal bar chart: number of calls per call type.
ax = df['Type'].value_counts().plot(kind='barh', figsize=(10, 7),
                                    color="coral", fontsize=12)
ax.set_alpha(0.8)
ax.set_title("Most common reasons for calling 911 in seattle?", fontsize=18)
ax.set_xlabel("Number of calls", fontsize=18)

# Grand total over all bars, used to express each bar as a percentage.
total = sum(patch.get_width() for patch in ax.patches)

# Annotate each bar with its absolute count and its share of the total.
for patch in ax.patches:
    count = int(patch.get_width())
    share = round((patch.get_width() / total) * 100, 2)
    ax.text(patch.get_width() + 7, patch.get_y() + .38,
            str(count) + ' (' + str(share) + '%)', fontsize=15,
            color='dimgrey')

# Invert so the largest category is on top.
ax.invert_yaxis()
#The same with interactive visualization
# BUGFIX: cufflinks must be imported and switched to offline mode BEFORE
# pandas objects gain an .iplot() method; the original ran the plot first,
# which fails in linear (top-to-bottom) execution.
import cufflinks as cf
cf.go_offline()
df.groupby('Type')['Type'].count().iplot(kind='bar')
# Interactive scatter of call locations, one trace (and colour) per type.
# BUGFIX: `go` and `iplot` were only imported further down in the original
# file, so this block raised NameError in linear execution; import them here.
import plotly.graph_objs as go
from plotly.offline import iplot

data = []
color_set = ['#FE9C43', '#7fc97f', '#fc8d62', '#66c2a5']
# one trace per call type, coloured from color_set (assumes <= 4 types)
for i, col in enumerate(df['Type'].unique()):
    data.append(go.Scatter(x=df[df['Type'] == col]['Latitude'],
                           y=df[df['Type'] == col]['Longitude'],
                           mode='markers',
                           line=dict(color=color_set[i], width=1, dash='dash'),
                           marker=dict(color=color_set[i], size=10),
                           name=col))
layout = go.Layout(
    xaxis=dict(title='Latitude'),
    yaxis=dict(title='Longitude'),
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
#Scatter plot on how the 4 types of calls are distributed using Seaborn
# BUGFIX: `size=` was deprecated in seaborn 0.9 and later removed;
# `height=` is the supported name for the same parameter.
sns.lmplot(x='Latitude', y='Longitude', data=df, hue='Type', height=8)
# Plotly / cufflinks setup for offline, in-notebook interactive plots.
# NOTE(review): `plotly.plotly` was moved to the separate `chart_studio`
# package in plotly 4.x — confirm the installed plotly version supports it.
import plotly.plotly as py
import pandas as pd
from plotly import __version__
print(__version__)
import cufflinks as cf
from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot
import plotly.figure_factory as ff
from plotly.tools import FigureFactory as FF
# render inside the notebook rather than posting to the plotly cloud
init_notebook_mode(connected=True)
import plotly.graph_objs as go
cf.go_offline()
#utility function to return some differentiate categories (those numbers are random and are used for
#choosing colors for the US map visualization)
def label_rows(row):
    """Map a row's 'Type' to an arbitrary numeric code used only for
    colouring the US-map visualization (returns None for unknown types,
    matching the original if-chain's fall-through)."""
    color_codes = {
        'Beaver Accident': 1,
        'Seal Attack': 8,
        'Latte Spills': 20,
        'Marshawn Lynch Sighting': 44,
    }
    return color_codes.get(row['Type'])
#creating a new column from the existing column for building a better model
df['type_label'] = df.apply(label_rows, axis=1)

# Single scattergeo trace: every call plotted at its coordinates,
# coloured by the numeric type_label column created above.
data = [{
    'type': 'scattergeo',
    'locationmode': 'US-states',
    'lon': df['Longitude'],
    'lat': df['Latitude'],
    #'text': df['Type'],
    'mode': 'markers',
    'marker': {
        'size': 8,
        'opacity': 0.8,
        'reversescale': True,
        'autocolorscale': False,
        'symbol': 'circle',
        'line': {'width': 0.5},
        'cmin': 0,
        'color': df['type_label'],
        'cmax': df['type_label'].max(),
    },
}]
# Map layout for the scattergeo trace above.
layout = dict(
    geo = dict(
        # BUGFIX: 'north west america' is not a valid plotly geo scope
        # (valid values include 'world', 'usa', 'north america', ...);
        # use the closest supported scope.
        scope = 'north america',
        showland = True,
        landcolor = "rgb(212, 212, 212)",
        subunitcolor = "rgb(255, 255, 255)",
        countrycolor = "rgb(255, 255, 255)",
        showlakes = True,
        lakecolor = "blue",
        showsubunits = True,
        showcountries = True,
        showocean = True,
        resolution = 50,
        projection = dict(
            type = 'conic conformal',
            rotation = dict(
                lon = -100
            )
        ),
        lonaxis = dict(
            showgrid = True,
            gridwidth = 0.5,
            range = [-160.0, -55.0],
            dtick = 5
        ),
        lataxis = dict(
            showgrid = True,
            gridwidth = 0.5,
            range = [35.0, 75.0],
            dtick = 5
        )
    ),
    title = '911 calls in Seattle',
)
fig = {'data': data, 'layout': layout}
iplot(fig)
#This is an interactive viz, please zoom in to seattle state(the orange dot on the viz) to visualize all the points
#with different colors
data = []
color_set = ['#FE9C43', '#7fc97f', '#fc8d62', '#66c2a5']
# one trace per call type, coloured from color_set (assumes <= 4 types)
for i, col in enumerate(df['Type'].unique()):
    data.append(go.Scatter(x=df[df['Type'] == col]['Latitude'],
                           y=df[df['Type'] == col]['Longitude'],
                           mode='markers',
                           line=dict(color=color_set[i], width=1, dash='dash'),
                           marker=dict(color=color_set[i], size=10),
                           name=col))
# BUGFIX: x carries Latitude and y carries Longitude, but the axis titles
# were swapped in the original; label each axis after the data actually
# plotted on it (matching the earlier scatter block).
layout = go.Layout(
    xaxis=dict(title='Latitude'),
    yaxis=dict(title='Longitude'),
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
From the above plot we see that some points look mislabeled — mostly the blue dots on the red cluster, i.e. Marshawn Lynch is sighted where a lot of 911 calls are recorded for Latte Spills.
Next, let's apply KMeans clustering using only the Latitude and Longitude features, to see if we can predict why a resident called 911.
from sklearn.cluster import KMeans

# Four clusters — one per known 911 call type.
kmeans = KMeans(n_clusters=4)
coord_cols = ['Latitude', 'Longitude']
# Fit on the raw coordinates only (unsupervised).
kmeans.fit(df[coord_cols])
# Inspect the fitted model: cluster centres and per-row labels.
kmeans.cluster_centers_
kmeans.labels_
df.describe()
#utility function to convert category into numerical data. I could also use pd.get_dummies()
def cluster(row):
    """Encode a row's 'Type' as an integer class label; unknown types
    get 4 (same as the original else branch)."""
    type_to_label = {
        'Beaver Accident': 0,
        'Seal Attack': 1,
        'Latte Spills': 2,
        'Marshawn Lynch Sighting': 3,
    }
    return type_to_label.get(row['Type'], 4)
# Encode the call type as an integer label for evaluation.
df['type_label'] = df.apply (lambda row: cluster(row),axis=1)
#Metrics for evaluating the model
from sklearn.metrics import confusion_matrix,classification_report
# NOTE(review): KMeans cluster ids are arbitrary — cluster 0 need not
# correspond to type_label 0 — so these scores are only meaningful if the
# learned cluster numbering happens to align with the hand-coded labels;
# verify against the confusion matrix before trusting the report.
print(confusion_matrix(df['type_label'],kmeans.labels_))
print(classification_report(df['type_label'],kmeans.labels_))
From the above metrics, the KMeans model did a decent job, with few misclassifications.
number of correct classifications: 1413
number of misclassifications: 101
percentage of correct prediction: 93.3%
Major number of misclassifications (47 from the confusion matrix) are Marshawn Lynch sightings.These are predicted as Latte Spills.
This also reconfirms our hypothesis that the type of accident calls can be predicted just from the available co-ordinates.
Yes, it concerns us a bit, because when 'Latitude' and 'Longitude' are treated as plain numbers they lose their real-world meaning. But since our model performed well at about 94%, we can manage with 'Latitude' and 'Longitude' as distances. If the model didn't perform decently enough, we would need to add functions that convert Lat/Long into some other form, such as a vector, that relates more closely to geographical coordinates.
#lets see how the number of clusters changes the error rate of the model.
# BUGFIX: the original predicted on X_test and compared with y_test, which
# are only defined further down the file (NameError in linear execution);
# evaluate on the full coordinate data against the hand-coded labels
# instead.  Note KMeans ignores any y passed to fit, so the original's
# fit(X, y) call was supervised in appearance only.
error_rate = []
dat = ['Latitude', 'Longitude']
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(df[dat])
    cluster_pred = kmeans.predict(df[dat])
    # fraction of rows whose cluster id differs from the encoded type
    error_rate.append(np.mean(cluster_pred != df['type_label']))
plt.figure(figsize=(10, 6))
plt.plot(range(1, 10), error_rate, color='blue', linestyle='dashed',
         marker='o', markerfacecolor='red', markersize=10)
plt.title("Error Rate vs K Value - KMeans")
plt.xlabel('K')
plt.ylabel('Error Rate')
From the above plot, we see that the error rate is lowest when the number of clusters is 4, 5, or 6. k=1 also shows a low error rate, but we can eliminate it because putting all the points into one category doesn't make sense.
Let's build a KNN model to see if we can improve on the previous model.
#for KNN, we need to normalize the features so that it won't be biased. For that we use StandardScaler
from sklearn.preprocessing import StandardScaler

feature_cols = ['Latitude', 'Longitude']
scaler = StandardScaler()
# fit_transform is equivalent to the separate fit() then transform() calls
scaled_features = scaler.fit_transform(df[feature_cols])
#normalized features
scaled_features
df_feat = pd.DataFrame(scaled_features, columns=['Latitude', 'Longitude'])
df_feat.head()
#model evaluation
# BUGFIX: sklearn.cross_validation was deprecated and removed in
# scikit-learn 0.20; train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
#lets split the data into train and test sets (30% held out, fixed seed)
X = df_feat
y = df['type_label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=4)
knn.fit(X_train, y_train)
pred_k = knn.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, pred_k))
print(classification_report(y_test, pred_k))
Hey! This model does a better job of classifying correctly than the previous model. This can be seen from the number of misclassifications in the confusion matrix, and from the precision and recall values.
number of correct classifications: 431 #(only Test data)
number of misclassifications: 24
percentage of correct prediction: 94.7%
# Sweep k from 1 to 9 and record the held-out error rate for each KNN model.
error_rate = []
for k in range(1, 10):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    preds = knn.predict(X_test)
    error_rate.append(np.mean(preds != y_test))

plt.figure(figsize=(10, 6))
plt.plot(range(1, 10), error_rate, color='blue', linestyle='dashed',
         marker='o', markerfacecolor='red', markersize=10)
plt.title("Error Rate vs K Value - KNN")
plt.xlabel('K')
plt.ylabel('Error Rate')
This model predicts the category by looking at the labels of its k nearest neighbors. Let's try again with k=3.
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
# BUGFIX: the original printed metrics for the stale `pred_k` predictions
# produced by the earlier k=4 model; predict with the freshly fitted
# k=3 model so the report actually reflects it.
pred_k = knn.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, pred_k))
print(classification_report(y_test, pred_k))
Let's try Random Forests, because we know that most of the time ensemble models do a good job compared with other models. First let's check how a decision tree classifier performs, and then I'll compare it with a Random Forest classifier.
# Baseline single-tree model: fit on the training split, score on the test split.
from sklearn.tree import DecisionTreeClassifier

dtree = DecisionTreeClassifier()
dtree.fit(X_train, y_train)
tree_preds = dtree.predict(X_test)
print(confusion_matrix(y_test, tree_preds))
print(classification_report(y_test, tree_preds))
number of correct classifications: 425 number of misclassifications: 30 percentage of correct prediction: 93.4%
The DecisionTree classifier did not perform better than the KMeans and KNN models.
# Ensemble of 250 trees; same train/test protocol as the single tree above.
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=250)
rfc.fit(X_train, y_train)
forest_preds = rfc.predict(X_test)
print(confusion_matrix(y_test, forest_preds))
print(classification_report(y_test, forest_preds))
number of correct classifications: 428
number of misclassifications: 27
percentage of correct prediction: 94%
Even Random Forest classifier also didn't perform very well. But also, It is little hard to tell which performed better because it really depends on what you value, whether you value Precision or Recall. Its probably more important to realise that 911 calls for Beaver accidents and seal attacks, which are more dangerous and which need immediate attention are not classified as(very few are classified like this) something less emergency like calls from sighting Marshawn Lynch or Latte Spills.
But again, it really depends upon the situation and what costs are associated with those decisions.
The main pattern in the misclassifications is that Marshawn Lynch sightings are classified as Latte Spills. This also reconfirms our hypothesis, since the Marshawn Lynch sighting points lie on the Latte Spills cluster!